Code and results. Full report at:
NB This is a bare bones .Rmd file. Please see the report for full details.
This run assumes:
Load the data - sourced from https://reshare.ukdataservice.ac.uk/853334/
dataPath <- "~/temp/"
# half-hourly total household consumption (pre-prepared)
totalF <- paste0(dataPath, "halfHourImputedTotalDemand_Fixed.csv.gz")
# half-hourly lighting consumption
lightingF <- paste0(dataPath, "halfHourLighting.csv.gz")
# half-hourly heat pump consumption
heatPumpF <- paste0(dataPath, "halfHourHeatPump.csv.gz")
# household attribute data
hhF <- paste0(dataPath, "ggHouseholdAttributesSafe_2019-04-09.csv.gz")
Now prep and check the data
# > power ----
totalDT <- data.table::fread(totalF)
setkey(totalDT, linkID)
uniqueN(totalDT$linkID)
## [1] 40
heatPumpDT <- data.table::fread(heatPumpF)
uniqueN(heatPumpDT$linkID)
## [1] 29
setkey(heatPumpDT, linkID)
lightingDT <- data.table::fread(lightingF)
uniqueN(lightingDT$linkID)
## [1] 23
setkey(lightingDT, linkID)
powerDataPrep <- function(dt){
dt[, meanConsumptionkWh := (meanPowerW/2)/1000]
dt[, r_as_dateTime := lubridate::as_datetime(r_dateTimeHalfHour)]
dt[, r_dateTimeNZ := lubridate::with_tz(r_as_dateTime, "Pacific/Auckland")]
dt[, r_date := lubridate::as_date(r_dateTimeNZ)]
dt[, r_halfHour := hms::as_hms(r_dateTimeNZ)]
dt <- addNZSeason(dt, dateTime = "r_dateTimeNZ")
return(dt)
}
totalDT <- powerDataPrep(totalDT)
heatPumpDT <- powerDataPrep(heatPumpDT)
lightingDT <- powerDataPrep(lightingDT)
Check that the time of day demand profiles look OK.
# check - beware which hemisphere we are in?
table(totalDT$month, totalDT$season)
##
## Spring Summer Autumn Winter
## Jan 0 121570 0 0
## Feb 0 113668 0 0
## Mar 0 0 128231 0
## Apr 0 0 138259 0
## May 0 0 138250 0
## Jun 0 0 0 141182
## Jul 0 0 0 151549
## Aug 0 0 0 138193
## Sep 127962 0 0 0
## Oct 130546 0 0 0
## Nov 120549 0 0 0
## Dec 0 119124 0 0
testDT <- totalDT[, .(meankWh = mean(meanConsumptionkWh)),
keyby = .(r_halfHour, season)]
ggplot2::ggplot(testDT, aes(x = r_halfHour, y = meankWh, colour = season)) +
geom_line() +
labs(caption = "Whole household kWh")
testDT <- lightingDT[, .(meankWh = mean(meanConsumptionkWh)),
keyby = .(r_halfHour, season)]
ggplot2::ggplot(testDT, aes(x = r_halfHour, y = meankWh, colour = season)) +
geom_line() +
labs(caption = "Lighting kWh where known")
# looks OK
message("# half-hour: whole household kWh")
## # half-hour: whole household kWh
summary(totalDT)
## linkID circuit r_dateTimeHalfHour
## Length:1569083 Length:1569083 Min. :2014-01-06 03:00:00
## Class :character Class :character 1st Qu.:2015-06-07 10:00:00
## Mode :character Mode :character Median :2016-03-10 10:00:00
## Mean :2016-04-27 11:10:59
## 3rd Qu.:2017-03-09 23:30:00
## Max. :2018-08-01 11:30:00
##
## nObs meanPowerW sdPowerW minPowerW
## Min. : 1.00 Min. :-7627.7 Min. : 0.00 Min. :-12407.5
## 1st Qu.:30.00 1st Qu.: 260.5 1st Qu.: 62.35 1st Qu.: 127.6
## Median :30.00 Median : 527.7 Median : 147.06 Median : 252.4
## Mean :30.26 Mean : 894.8 Mean : 410.87 Mean : 427.9
## 3rd Qu.:30.00 3rd Qu.: 1259.9 3rd Qu.: 706.25 3rd Qu.: 529.9
## Max. :60.00 Max. :10981.6 Max. :4954.00 Max. : 9600.6
## NA's :92
## maxPowerW meanConsumptionkWh r_as_dateTime
## Min. :-6970.9 Min. :-3.8138 Min. :2014-01-06 03:00:00
## 1st Qu.: 395.5 1st Qu.: 0.1302 1st Qu.:2015-06-07 10:00:00
## Median : 1003.4 Median : 0.2639 Median :2016-03-10 10:00:00
## Mean : 1702.7 Mean : 0.4474 Mean :2016-04-27 11:10:59
## 3rd Qu.: 2746.7 3rd Qu.: 0.6299 3rd Qu.:2017-03-09 23:30:00
## Max. :15593.2 Max. : 5.4908 Max. :2018-08-01 11:30:00
##
## r_dateTimeNZ r_date r_halfHour
## Min. :2014-01-06 16:00:00 Min. :2014-01-06 Length:1569083
## 1st Qu.:2015-06-07 22:00:00 1st Qu.:2015-06-07 Class1:hms
## Median :2016-03-10 23:00:00 Median :2016-03-10 Class2:difftime
## Mean :2016-04-27 23:10:59 Mean :2016-04-27 Mode :numeric
## 3rd Qu.:2017-03-10 12:30:00 3rd Qu.:2017-03-10
## Max. :2018-08-01 23:30:00 Max. :2018-08-01
##
## month season
## Jul :151549 Spring:379057
## Jun :141182 Summer:354362
## Apr :138259 Autumn:404740
## May :138250 Winter:430924
## Aug :138193
## Oct :130546
## (Other):731104
Process and check household data
# > household ----
hhDT <- data.table::fread(hhF)
hhDT <- code_Q7(hhDT)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
hhDT[, hasPV := `PV Inverter`]
hhDT[, hasPV := ifelse(hasPV == "", "No", hasPV)]
# check
uniqueN(hhDT$linkID)
## [1] 44
uniqueN(totalDT$linkID)
## [1] 40
setkey(hhDT, linkID)
setkey(totalDT, linkID)
#> data checks ----
# any zeros & negative numbers?
hist(totalDT$meanConsumptionkWh)
# some
# check if aggregated to daily sum
dailyAllDT <- totalDT[, .(sumkWh = sum(meanConsumptionkWh)), keyby = .(r_date, linkID)]
hist(dailyAllDT$sumkWh)
# still some neg - probably due to PV?
# check if aggregated to daily mean
dt <- dailyAllDT[, .(meankWh = mean(sumkWh),
sdkWh = sd(sumkWh)), keyby = .(linkID)]
hist(dt$meankWh)
# still some
Check if the negative values are to do with PV
# need to check -ve = mid-day, if not is not PV must just be errors?
totalDT[, testValues := "> 0"]
totalDT[, testValues := ifelse(meanConsumptionkWh == 0, "0", testValues)]
totalDT[, testValues := ifelse(meanConsumptionkWh < 0, "< 0", testValues)]
testDT <- totalDT[hhDT[, .(linkID, hasPV)]]
dt <- testDT[testValues == "< 0", .(nValues = .N),
keyby = .(r_halfHour, season, linkID, testValues,hasPV)]
p <- ggplot2::ggplot(dt[!is.na(testValues) & !is.na(season)], aes(x = r_halfHour, y = nValues, colour = linkID)) +
geom_line() +
facet_grid(season ~ testValues + hasPV) +
labs(x = "Half hour",
y = "N",
caption = "Colours = individual dwellings, legend omitted for clarity"
) +
theme(legend.position="none")
p
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plotly::ggplotly(p)
# so:
# a) neg values indicate PV
# b) at least 1 dwelling had PV but didn't say so in survey - rf_06
hhDT[, hasPVfixed := ifelse(linkID == "rf_06", "yes", hasPV)]
hhDT[, .(n = .N), keyby = .(hasPV, hasPVfixed)]
## hasPV hasPVfixed n
## 1: No No 39
## 2: No yes 1
## 3: yes yes 4
# If we only care about network load we could keep all values > 0 only
# If we want total energy input then we need grid draw + PV input which we might be able
# to calculate from the PV circuit W - grid export
# but it gets complicated.
# So for now we'll leave out the dwellinmgs which seem to have PV
setkey(dailyAllDT, linkID)
dailyDT <- dailyAllDT[hhDT[hasPVfixed == "No"]]
dailyMeanDT <- dailyDT[!is.na(sumkWh), .(meankWh = mean(sumkWh, na.rm = TRUE),
sdkWh = sd(sumkWh, na.rm = TRUE)), keyby = .(linkID)]
setkey(dailyMeanDT, linkID)
#> final data ----
dailyMeanLinkedDT <- dailyMeanDT[hhDT]
dailyMeanLinkedDT <- dailyMeanLinkedDT[!is.na(meankWh)]
make_p95Table <- function(dt,groupVar, kWh = "meankWh"){
# aggregates kWh
# remember that kWh could be defined in a range of ways (daily sum, mean etc)
t95 <- dt[!is.na(Q7labAgg), .(meankWh = mean(get(kWh)),
minkWh = min(get(kWh)),
maxkWh = max(get(kWh)),
sdkWh = sd(get(kWh)),
nHouseholds = uniqueN(linkID)),
keyby = .(category = get(groupVar))]
setnames(t95, c("category"), groupVar)
t90 <- copy(t95) # has to be a copy
t95[, Threshold := "p < 0.05 (observed n)"]
t95[, se := sdkWh/sqrt(nHouseholds)]
t95[, error := qnorm(0.975)*se]
t95[, CI_lower := meankWh - error]
t95[, CI_upper := meankWh + error]
t90[, Threshold := "p < 0.1 (observed n)"]
t90[, se := sdkWh/sqrt(nHouseholds)]
t90[, error := qnorm(0.95)*se]
t90[, CI_lower := meankWh - error]
t90[, CI_upper := meankWh + error]
return(rbind(t90, t95))
}
make_CIplot <- function(t, kwh = "meankWh", xVar, xLab){
# plots whatever kWh is by xVar
p <- ggplot2::ggplot(obsT, aes(x = get(xVar), y = kWh, fill = Threshold)) +
geom_col() +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper)) +
facet_wrap(Threshold ~ .) +
labs(x = xLab,
y = "Mean daily kWh"
) +
theme(legend.position="bottom")
ggplot2::ggsave(paste0(xVar,"_CI.png"), p,
width = 6, path = here::here("plots"))
return(p)
}
Compare pre 2000 with after
obsT <- make_p95Table(dailyMeanLinkedDT[!is.na(Q7labAgg) & !is.na(meankWh)],
groupVar = "Q7labAgg")
obsT[, kWh := meankWh]
make_CIplot(obsT, xVar = "Q7labAgg", xLab = "Dwelling age")
## Saving 6 x 5 in image
kableExtra::kable(obsT, digits = 3) %>%
kable_styling()
| Q7labAgg | meankWh | minkWh | maxkWh | sdkWh | nHouseholds | Threshold | se | error | CI_lower | CI_upper | kWh |
|---|---|---|---|---|---|---|---|---|---|---|---|
| < 1999 | 23.253 | 9.921 | 45.924 | 10.094 | 22 | p < 0.1 (observed n) | 2.152 | 3.540 | 19.713 | 26.793 | 23.253 |
| >= 2000 | 18.721 | 6.298 | 27.650 | 7.151 | 10 | p < 0.1 (observed n) | 2.261 | 3.720 | 15.001 | 22.441 | 18.721 |
| < 1999 | 23.253 | 9.921 | 45.924 | 10.094 | 22 | p < 0.05 (observed n) | 2.152 | 4.218 | 19.035 | 27.471 | 23.253 |
| >= 2000 | 18.721 | 6.298 | 27.650 | 7.151 | 10 | p < 0.05 (observed n) | 2.261 | 4.432 | 14.288 | 23.153 | 18.721 |
# What 'effect size' do we observe?
m1 <- obsT[Q7labAgg == "< 1999" & Threshold %like% "0.05", meankWh]
m2 <- obsT[Q7labAgg == ">= 2000" & Threshold %like% "0.05", meankWh]
# common sd??
r <- lm(meankWh ~ Q7labAgg, data = dailyMeanLinkedDT)
results <- summary(r) # we need the rse https://online.stat.psu.edu/stat462/node/94/
rse <- results$sigma # rse
diff <- abs(m1-m2)
# Difference
diff
## [1] 4.532435
d <- diff/rse
n1 <- obsT[Q7labAgg == "< 1999" & Threshold %like% "0.05", nHouseholds]
n2 <- obsT[Q7labAgg == ">= 2000" & Threshold %like% "0.05", nHouseholds]
p1 <- n1/(n1 + n2) # with
# what effect size would we need for the GG n? p = 0.05
pwr95 <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95$d
## [1] 1.104225
# which means a kWh difference of
pwr95$d * rse
## [1] 10.27974
# what effect size would we need for the GG n? p = 0.1
pwr90 <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.1, power = 0.8)
pwr90
##
## t test power calculation
##
## n1 = 22
## n2 = 10
## d = 0.9704298
## sig.level = 0.1
## power = 0.8
## alternative = two.sided
# which means a kWh difference of
pwr90$d * rse
## [1] 9.034182
# what effect size could we get with HEEPfull? p = 0.05
n1 <- p1 * HEEP2full
n2 <- HEEP2full - n1
pwr95_HEEP2full <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2full
##
## t test power calculation
##
## n1 = 192.5
## n2 = 87.5
## d = 0.3624643
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
#HEEP2full
# which means a kWh difference of
pwr95_HEEP2full$d * rse
## [1] 3.374348
# what effect size could we get with HEEPfullAvail? p = 0.05
n1 <- p1 * HEEP2fullAvail
n2 <- HEEP2fullAvail - n1
pwr95_HEEP2fullAvail <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2fullAvail
##
## t test power calculation
##
## n1 = 481.25
## n2 = 218.75
## d = 0.2287704
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
#HEEP2full
# which means a kWh difference of
pwr95_HEEP2fullAvail$d * rse
## [1] 2.12973
# what effect size could we get with HEEP2pool? p = 0.05
# HEEP2pool obtained - see table
n1 <- p1 * HEEP2poolOb
n2 <- HEEP2poolOb - n1
pwr95_HEEP2poolOb <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2poolOb
##
## t test power calculation
##
## n1 = 1925
## n2 = 875
## d = 0.1142672
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
# which means a kWh difference of
# HEEP2pool
pwr95_HEEP2poolOb$d * rse
## [1] 1.063766
# report table
kableExtra::kable(obsT, digits = 2) %>%
kable_styling()
| Q7labAgg | meankWh | minkWh | maxkWh | sdkWh | nHouseholds | Threshold | se | error | CI_lower | CI_upper | kWh |
|---|---|---|---|---|---|---|---|---|---|---|---|
| < 1999 | 23.25 | 9.92 | 45.92 | 10.09 | 22 | p < 0.1 (observed n) | 2.15 | 3.54 | 19.71 | 26.79 | 23.25 |
| >= 2000 | 18.72 | 6.30 | 27.65 | 7.15 | 10 | p < 0.1 (observed n) | 2.26 | 3.72 | 15.00 | 22.44 | 18.72 |
| < 1999 | 23.25 | 9.92 | 45.92 | 10.09 | 22 | p < 0.05 (observed n) | 2.15 | 4.22 | 19.04 | 27.47 | 23.25 |
| >= 2000 | 18.72 | 6.30 | 27.65 | 7.15 | 10 | p < 0.05 (observed n) | 2.26 | 4.43 | 14.29 | 23.15 | 18.72 |
# GG
t <- table(hhDT$`Heat pump number`, useNA = "always")
t
##
## 1 3 <NA>
## 23 2 19
prop.table(t)
##
## 1 3 <NA>
## 0.52272727 0.04545455 0.43181818
# with elec data
dailyMeanLinkedDT[, heatPumps := `Heat pump number`]
dailyMeanLinkedDT[, heatPumps := ifelse(is.na(`Heat pump number`), 0, heatPumps)]
dailyMeanLinkedDT[, .(nHHs = uniqueN(linkID)), keyby = .(heatPumps)]
## heatPumps nHHs
## 1: 0 14
## 2: 1 19
## 3: 3 1
obsT <- make_p95Table(dailyMeanLinkedDT[!is.na(heatPumps)], "heatPumps")
obsT[, kWh := meankWh]
make_CIplot(obsT, kwh = "meankWh", xVar = "heatPumps", xLab = "N heat pumps")
## Saving 6 x 5 in image
kableExtra::kable(obsT, digits = 3) %>%
kable_styling()
| heatPumps | meankWh | minkWh | maxkWh | sdkWh | nHouseholds | Threshold | se | error | CI_lower | CI_upper | kWh |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19.646 | 6.298 | 34.445 | 8.083 | 13 | p < 0.1 (observed n) | 2.242 | 3.687 | 15.959 | 23.333 | 19.646 |
| 1 | 22.693 | 9.921 | 45.924 | 10.032 | 18 | p < 0.1 (observed n) | 2.365 | 3.890 | 18.804 | 26.583 | 22.693 |
| 3 | 34.898 | 34.898 | 34.898 | NA | 1 | p < 0.1 (observed n) | NA | NA | NA | NA | 34.898 |
| 0 | 19.646 | 6.298 | 34.445 | 8.083 | 13 | p < 0.05 (observed n) | 2.242 | 4.394 | 15.252 | 24.040 | 19.646 |
| 1 | 22.693 | 9.921 | 45.924 | 10.032 | 18 | p < 0.05 (observed n) | 2.365 | 4.635 | 18.059 | 27.328 | 22.693 |
| 3 | 34.898 | 34.898 | 34.898 | NA | 1 | p < 0.05 (observed n) | NA | NA | NA | NA | 34.898 |
# switch to binary for HPs - yes or no
dailyMeanLinkedDT[, hasHeatPump := "No"]
dailyMeanLinkedDT[, hasHeatPump := ifelse(heatPumps > 0, "Yes", hasHeatPump)]
obsT <- make_p95Table(dailyMeanLinkedDT[!is.na(hasHeatPump)], "hasHeatPump")
# report table
kableExtra::kable(obsT, digits = 2) %>%
kable_styling()
| hasHeatPump | meankWh | minkWh | maxkWh | sdkWh | nHouseholds | Threshold | se | error | CI_lower | CI_upper |
|---|---|---|---|---|---|---|---|---|---|---|
| No | 19.65 | 6.30 | 34.44 | 8.08 | 13 | p < 0.1 (observed n) | 2.24 | 3.69 | 15.96 | 23.33 |
| Yes | 23.34 | 9.92 | 45.92 | 10.14 | 19 | p < 0.1 (observed n) | 2.33 | 3.83 | 19.51 | 27.16 |
| No | 19.65 | 6.30 | 34.44 | 8.08 | 13 | p < 0.05 (observed n) | 2.24 | 4.39 | 15.25 | 24.04 |
| Yes | 23.34 | 9.92 | 45.92 | 10.14 | 19 | p < 0.05 (observed n) | 2.33 | 4.56 | 18.77 | 27.90 |
# What 'effect size' do we observe?
m1 <- obsT[hasHeatPump == "No" & Threshold %like% "0.05", meankWh]
m2 <- obsT[hasHeatPump == "Yes" & Threshold %like% "0.05", meankWh]
# common sd??
r <- lm(meankWh ~ hasHeatPump, data = dailyMeanLinkedDT)
results <- summary(r) # we need the rse https://online.stat.psu.edu/stat462/node/94/
rse <- results$sigma # rse
diff <- abs(m1-m2)
# Difference
diff
## [1] 3.689753
d <- diff/rse
n1 <- obsT[hasHeatPump == "No" & Threshold %like% "0.05", nHouseholds]
n2 <- obsT[hasHeatPump == "Yes" & Threshold %like% "0.05", nHouseholds]
# what effect size would we need for the GG n? p = 0.05
pwr95 <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95
##
## t test power calculation
##
## n1 = 13
## n2 = 19
## d = 1.042146
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
pwr95$d
## [1] 1.042146
# which means a kWh difference of
pwr95$d * rse
## [1] 9.540305
# what effect size would we need for the GG n? p = 0.1
pwr90 <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.1, power = 0.8)
pwr90
##
## t test power calculation
##
## n1 = 13
## n2 = 19
## d = 0.9158508
## sig.level = 0.1
## power = 0.8
## alternative = two.sided
# which means a kWh difference of
pwr90$d * rse
## [1] 8.384138
# what effect size could we get with HEEP2full? p = 0.05
# Use HCS proportions
# HCS: 45% owned, 27% renters, overall = 38% (Vicki White and Mark Jones 2017)
n1 <- HEEP2full - (HEEP2full*0.38)
n2 <- HEEP2full*0.38
pwr95_HEEP2full <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2full
##
## t test power calculation
##
## n1 = 173.6
## n2 = 106.4
## d = 0.3461309
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
#HEEP2full
# which means a kWh difference of
pwr95_HEEP2full$d * rse
## [1] 3.168648
# for drawing plot
getDrange <- function(nList,p){
dt <- data.table::data.table()
for(n in nList){
n2 <- n * p # p = the proportion that have X
n1 <- n - n2
pres <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
res <- as.data.table(c(n1,n2,n, pres$d))
dt <- rbind(dt,transpose(res))
}
return(dt)
}
nList <- seq(100,3000,50)
p <- 0.38
resDT <- getDrange(nList, p)
resDT[, kWhDiff := V4 * rse]
pl <- ggplot2::ggplot(resDT, aes(x = V3, y = kWhDiff)) +
geom_line() +
geom_point() +
geom_vline(xintercept = HEEP2full, alpha = 0.3) +
geom_vline(xintercept = HEEP2fullAvail, alpha = 0.3) +
geom_vline(xintercept = HEEP2poolOb, alpha = 0.3) +
labs(x = "Total sample size",
y = "Mean kWh difference",
caption = paste0("p = 0.05, power = 0.8\n",
"% with heat pump = ",100*p))
pl
ggplot2::ggsave("kWhDiff_rangeHeatPumps.png", pl,
width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
Now calculate the different d & kWh for the sample sizes
# what effect size could we get with HEEP2poolAvail? p = 0.05
# HEEP2pool obtained - see table
n1 <- HEEP2fullAvail - (HEEP2fullAvail*0.38)
n2 <- HEEP2fullAvail*0.38
pwr95_HEEP2poolAvail <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2poolAvail
##
## t test power calculation
##
## n1 = 434
## n2 = 266
## d = 0.2184447
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
# which means a kWh difference of
# HEEP2pool
pwr95_HEEP2poolAvail$d * rse
## [1] 1.999747
# what effect size could we get with HEEP2poolObtained? p = 0.05
# HEEP2pool obtained - see table
n1 <- HEEP2poolOb - (HEEP2poolOb*0.38)
n2 <- HEEP2poolOb*0.38
pwr95_HEEP2poolOb <- pwr::pwr.t2n.test(n1 = n1,
n2 = n2,
d = , sig.level = 0.05, power = 0.8)
pwr95_HEEP2poolOb
##
## t test power calculation
##
## n1 = 1736
## n2 = 1064
## d = 0.1091407
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
# which means a kWh difference of
# HEEP2pool
pwr95_HEEP2poolOb$d * rse
## [1] 0.9991266
# >> compare am/pm ----
# use HP only data
# remove -ve values
heatPumpDT <- heatPumpDT[meanConsumptionkWh >= 0]
# check heat pump patterns
plotDT <- heatPumpDT[, .(meankWh = mean(meanConsumptionkWh)),
keyby = .(r_halfHour, season)]
ggplot2::ggplot(plotDT, aes(x = r_halfHour, y = meankWh, colour = season)) +
geom_line()
# Based on the line plot...
heatPumpDT[, period := ifelse(lubridate::hour(r_dateTimeHalfHour) >= 4 &
lubridate::hour(r_dateTimeHalfHour) < 10,
"04:00 - 10:00", NA)]
heatPumpDT[, period := ifelse(lubridate::hour(r_dateTimeHalfHour) >= 16 &
lubridate::hour(r_dateTimeHalfHour) < 22,
"16:00 - 22:00", period)]
# check
with(heatPumpDT, table(lubridate::hour(r_dateTimeHalfHour),period))
## period
## 04:00 - 10:00 16:00 - 22:00
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 52173 0
## 5 52200 0
## 6 52249 0
## 7 52390 0
## 8 52491 0
## 9 52273 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
## 15 0 0
## 16 0 52081
## 17 0 52114
## 18 0 52126
## 19 0 52142
## 20 0 52102
## 21 0 52079
## 22 0 0
## 23 0 0
dailyHeatPumpDT <- heatPumpDT[!is.na(period), .(meankWh = mean(meanConsumptionkWh), # use mean as some have multiple circuits
nObs = .N), # how many obs?
keyby = .(r_date, period, linkID, season)]
# check
dailyHeatPumpDT[, .(meanSum = mean(meankWh)),
keyby = .(season, period)]
## season period meanSum
## 1: Spring 04:00 - 10:00 0.07821803
## 2: Spring 16:00 - 22:00 0.06663372
## 3: Summer 04:00 - 10:00 0.05652707
## 4: Summer 16:00 - 22:00 0.02865547
## 5: Autumn 04:00 - 10:00 0.08025595
## 6: Autumn 16:00 - 22:00 0.05777627
## 7: Winter 04:00 - 10:00 0.23322393
## 8: Winter 16:00 - 22:00 0.17475496
ggplot2::ggplot(dailyHeatPumpDT, aes(y = meankWh, x = period)) +
geom_boxplot() +
facet_wrap(season ~ .)
dailyMeanHeatPumpDT <- dailyHeatPumpDT[, .(meankWh = mean(meankWh),
sdkWh = sd(meankWh)),
keyby = .(linkID, period, season)]
dailyMeanHeatPumpDT[, .(mean = mean(meankWh)),
keyby = .(season, period)]
## season period mean
## 1: Spring 04:00 - 10:00 0.07676568
## 2: Spring 16:00 - 22:00 0.07691729
## 3: Summer 04:00 - 10:00 0.04709558
## 4: Summer 16:00 - 22:00 0.02294221
## 5: Autumn 04:00 - 10:00 0.07036280
## 6: Autumn 16:00 - 22:00 0.06122734
## 7: Winter 04:00 - 10:00 0.22114839
## 8: Winter 16:00 - 22:00 0.17957456
setkey(dailyMeanHeatPumpDT, linkID)
dailyMeanHeatPumpDTLinkedDT <- dailyMeanHeatPumpDT[hhDT]
obsT <- make_p95Table(dailyMeanHeatPumpDTLinkedDT[!is.na(period)], groupVar = "period")
obsT[, kWh := meankWh]
p <- make_CIplot(obsT[season == "winter"], # winter only
kwh = "meankWh", xVar = "period", xLab = "Period")
## Saving 6 x 5 in image
p <- p + coord_flip() + labs(y = "Mean kWh")
ggplot2::ggsave(paste0("heatPumpByPeriod_Winter_CI.png"), p,
width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
kableExtra::kable(obsT, digits = 3) %>%
kable_styling()
| period | meankWh | minkWh | maxkWh | sdkWh | nHouseholds | Threshold | se | error | CI_lower | CI_upper | kWh |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 04:00 - 10:00 | 0.104 | 0 | 0.483 | 0.112 | 27 | p < 0.1 (observed n) | 0.021 | 0.035 | 0.069 | 0.139 | 0.104 |
| 16:00 - 22:00 | 0.077 | 0 | 0.563 | 0.094 | 27 | p < 0.1 (observed n) | 0.018 | 0.030 | 0.048 | 0.107 | 0.077 |
| 04:00 - 10:00 | 0.104 | 0 | 0.483 | 0.112 | 27 | p < 0.05 (observed n) | 0.021 | 0.042 | 0.062 | 0.146 | 0.104 |
| 16:00 - 22:00 | 0.077 | 0 | 0.563 | 0.094 | 27 | p < 0.05 (observed n) | 0.018 | 0.035 | 0.042 | 0.113 | 0.077 |
Now power calcs - needs paired
# >> Power calcs - paired ----
# What 'effect size' do we observe?
m1 <- obsT[period == "04:00 - 10:00" & Threshold %like% "0.05", meankWh]
m2 <- obsT[period != "04:00 - 10:00" & Threshold %like% "0.05", meankWh]
# common sd??
# strictly speaking we have paired observations so is this correct?
r <- lm(meankWh ~ period, data = dailyMeanHeatPumpDTLinkedDT)
results <- summary(r) # we need the rse https://online.stat.psu.edu/stat462/node/94/
rse <- results$sigma # rse
diff <- abs(m1-m2)
# Difference
diff
## [1] 0.02644616
d <- diff/rse
getPairedD <- function(nList,rse){
dt <- data.table::data.table()
for(n in nList){
pres <- pwr::pwr.t.test(n = n,
d = , sig.level = 0.05, power = 0.8,
type = c("paired"))
res <- as.data.table(c(n, pres$d, pres$d * rse))
dt <- rbind(dt,transpose(res))
}
return(dt)
}
nList <- c(uniqueN(dailyMeanHeatPumpDT$linkID), HEEP2full,HEEP2fullAvail,HEEP2poolOb)
getPairedD(nList, rse) # print out the d and kWh diff required for each n
## V1 V2 V3
## 1: 29 0.53895500 0.05675347
## 2: 280 0.16800541 0.01769144
## 3: 700 0.10601270 0.01116343
## 4: 2800 0.05295149 0.00557594
nList <- seq(50,1000,50)
pairedDT <- getPairedD(nList, rse) # print out the d and kWh diff required for each n
pl <- ggplot2::ggplot(pairedDT, aes(x = V1, y = V3)) +
geom_line() +
geom_point() +
labs(x = "Sub-group sample size",
y = "Mean kWh difference",
caption = paste0("p = 0.05, power = 0.8"))
pl
ggplot2::ggsave("kWhDiff_rangeHeatPumpsSubgroups.png", pl,
width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
# GG
t <- table(hhDT$Q49_coded, useNA = "always")
t
##
## Energy saving - cfl Halogen Incandescent
## 2 24 3 9
## LED <NA>
## 6 0
prop.table(t)
##
## Energy saving - cfl Halogen Incandescent
## 0.04545455 0.54545455 0.06818182 0.20454545
## LED <NA>
## 0.13636364 0.00000000
# need to use lighting extract
dailyLightingDT <- lightingDT[, .(sumkWh = sum(meanConsumptionkWh)),
keyby = .(r_date, linkID)]
dailyMeanLightingDT <- dailyLightingDT[, .(meankWh = mean(sumkWh),
sdkWh = sd(sumkWh)),
keyby = .(linkID)]
setkey(dailyMeanLightingDT, linkID)
dailyMeanLightingLinkedDT <- dailyMeanLightingDT[hhDT]
dailyMeanLightingLinkedDT[, .(nHHs = uniqueN(linkID)), keyby = .(Q49_coded)]
## Q49_coded nHHs
## 1: 2
## 2: Energy saving - cfl 24
## 3: Halogen 3
## 4: Incandescent 9
## 5: LED 6
dailyMeanLightingLinkedDT <- dailyMeanLightingLinkedDT[Q49_coded != "" &
!is.na(meankWh)]
obsT <- make_p95Table(dailyMeanLightingLinkedDT[!is.na(Q49_coded)], groupVar = "Q49_coded")
obsT[, kWh := meankWh]
p <- make_CIplot(obsT, kwh = "meankWh", xVar = "Q49_coded", xLab = "Main lumen type")
## Saving 6 x 5 in image
p <- p + coord_flip()
ggplot2::ggsave(paste0("Q49_coded_CI.png"), p,
width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
kableExtra::kable(obsT, digits = 3) %>%
kable_styling()
| Q49_coded | meankWh | minkWh | maxkWh | sdkWh | nHouseholds | Threshold | se | error | CI_lower | CI_upper | kWh |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Energy saving - cfl | 2.015 | 0.110 | 8.562 | 2.308 | 12 | p < 0.1 (observed n) | 0.666 | 1.096 | 0.920 | 3.111 | 2.015 |
| Halogen | 2.262 | 1.779 | 2.745 | 0.683 | 2 | p < 0.1 (observed n) | 0.483 | 0.794 | 1.468 | 3.056 | 2.262 |
| Incandescent | 6.028 | 1.181 | 12.302 | 4.330 | 5 | p < 0.1 (observed n) | 1.937 | 3.185 | 2.842 | 9.213 | 6.028 |
| LED | 1.278 | 0.776 | 1.780 | 0.710 | 2 | p < 0.1 (observed n) | 0.502 | 0.826 | 0.452 | 2.103 | 1.278 |
| Energy saving - cfl | 2.015 | 0.110 | 8.562 | 2.308 | 12 | p < 0.05 (observed n) | 0.666 | 1.306 | 0.710 | 3.321 | 2.015 |
| Halogen | 2.262 | 1.779 | 2.745 | 0.683 | 2 | p < 0.05 (observed n) | 0.483 | 0.946 | 1.316 | 3.209 | 2.262 |
| Incandescent | 6.028 | 1.181 | 12.302 | 4.330 | 5 | p < 0.05 (observed n) | 1.937 | 3.796 | 2.232 | 9.823 | 6.028 |
| LED | 1.278 | 0.776 | 1.780 | 0.710 | 2 | p < 0.05 (observed n) | 0.502 | 0.984 | 0.294 | 2.261 | 1.278 |
# > Confidence intervals ----
z <- qnorm(0.975) # p = 0.05
p <- 0.4 #test a p
n <- 150
MoE <- z * sqrt(p*(1-p)/(n-1)) # calculate margin of error
MoE
## [1] 0.0786612
# > Power ----
# pwr.2p.test(h = , n = , sig.level =, power = )
# calculate for single sample
pwr::pwr.2p.test(h = , n = HEEP2full, sig.level = 0.05, power = 0.8)
##
## Difference of proportion power calculation for binomial distribution (arcsine transformation)
##
## h = 0.2367594
## n = 280
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: same sample sizes
# but this produces h which needs converting back to %
# single sample test
# n = n in the single sample
pwr.p.test(h = , n = HEEP2full, sig.level = 0.05, power = 0.8)
##
## proportion power calculation for binomial distribution (arcsine transformation)
##
## h = 0.1674264
## n = 280
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
Test n for different p and proportions within the single sample
# power.prop.test is easier to use
# calculate n for each group - e.g. % heat pumps in renters vs owner-occupiers
# 40% & 25%
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25,
power = 0.8, sig.level = 0.01)
##
## Two-sample comparison of proportions power calculation
##
## n = 226.2947
## p1 = 0.4
## p2 = 0.25
## sig.level = 0.01
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25,
power = 0.8, sig.level = 0.05)
##
## Two-sample comparison of proportions power calculation
##
## n = 151.8689
## p1 = 0.4
## p2 = 0.25
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25,
power = 0.8, sig.level = 0.10)
##
## Two-sample comparison of proportions power calculation
##
## n = 119.509
## p1 = 0.4
## p2 = 0.25
## sig.level = 0.1
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.4, p2 = 0.25,
power = 0.8, sig.level = 0.20)
##
## Two-sample comparison of proportions power calculation
##
## n = 87.00637
## p1 = 0.4
## p2 = 0.25
## sig.level = 0.2
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
message("# 10% & 15%")
## # 10% & 15%
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15,
power = 0.8, sig.level = 0.01)
##
## Two-sample comparison of proportions power calculation
##
## n = 1020.47
## p1 = 0.1
## p2 = 0.15
## sig.level = 0.01
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15,
power = 0.8, sig.level = 0.05)
##
## Two-sample comparison of proportions power calculation
##
## n = 685.5969
## p1 = 0.1
## p2 = 0.15
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15,
power = 0.8, sig.level = 0.10)
##
## Two-sample comparison of proportions power calculation
##
## n = 539.9264
## p1 = 0.1
## p2 = 0.15
## sig.level = 0.1
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
stats::power.prop.test(n = NULL, p1 = 0.1, p2 = 0.15,
power = 0.8, sig.level = 0.20)
##
## Two-sample comparison of proportions power calculation
##
## n = 393.5438
## p1 = 0.1
## p2 = 0.15
## sig.level = 0.2
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
Create a proportions plot and table
calculateProportionsMoE <- function(props, sig, samples){
# calculate margins of error given prop, a range of significance thresholds (sigs) and sample sizes
nProps <- length(props)
nSamps <- length(samples)
#initialise results array
resultsArray <- array(numeric(nSamps*nProps),
dim=c(nSamps,nProps)
)
# loop over samples
for (s in 1:nSamps){
# loop over significance values
for (p in 1:nProps){
me <- qnorm(1-(sig/2)) * sqrt(props[p]*(1 - props[p])/(samples[s]-1))
resultsArray[s,p] <- me # report effect size against sample size
}
}
dt <- data.table::as.data.table(resultsArray) # convert to dt for tidying
dt <- cbind(dt, samples)
longDT <- data.table::as.data.table(reshape2::melt(dt, id=c("samples")))
longDT <- data.table::setnames(longDT, "value", "moe")
longDT <- data.table::setnames(longDT, "variable", "proportion")
return(longDT) # returned the tidied & long form dt
}
samples <- seq(100,3000,10)
props <- c(0.1, 0.2, 0.3, 0.4, 0.5)
dt <- calculateProportionsMoE(props = props, # one sample
sig = 0.05,
samples = samples
)
# need to recode vars
# must be an easier way
dt[, prop := ifelse(proportion == "V1", "10%", NA)]
dt[, prop := ifelse(proportion == "V2", "20%", prop)]
dt[, prop := ifelse(proportion == "V3", "30%", prop)]
dt[, prop := ifelse(proportion == "V4", "40%", prop)]
dt[, prop := ifelse(proportion == "V5", "50%", prop)]
p <- ggplot2::ggplot(dt, aes(x = samples, y = 100*moe, colour = prop)) +
geom_point(alpha = 0.5) +
geom_line() +
labs(y = "Margin of error (+/-%)",
x = "Sample N (single sample)") +
scale_color_discrete(name="Proportion:") +
theme(legend.position="bottom") +
geom_vline(xintercept = HEEP2full, alpha = 0.3) +
geom_vline(xintercept = HEEP2fullAvail, alpha = 0.3) +
geom_vline(xintercept = HEEP2poolOb, alpha = 0.3)
p <- p + annotate(geom = "text",
x = HEEP2full,
y = 0.9*(max(p$data$moe)*100),
label = paste0("n =", HEEP2full),
hjust = "left") +
annotate(geom = "text",
x = HEEP2fullAvail,
y = 0.8*(max(p$data$moe)*100),
label = paste0("n = ", HEEP2fullAvail),
hjust = "left") +
annotate(geom = "text",
x = HEEP2poolOb,
y = 0.9*(max(p$data$moe)*100),
label = paste0("n = ", HEEP2poolOb),
hjust = "right")
ggplot2::ggsave("proportionsMoE.png", p,
width = 6, path = here::here("plots"))
## Saving 6 x 5 in image
p
# details
dt[samples == HEEP2full]
## samples proportion moe prop
## 1: 280 V1 0.03520199 10%
## 2: 280 V2 0.04693599 20%
## 3: 280 V3 0.05377193 30%
## 4: 280 V4 0.05748461 40%
## 5: 280 V5 0.05866999 50%
dt[samples == HEEP2fullAvail]
## samples proportion moe prop
## 1: 700 V1 0.02223979 10%
## 2: 700 V2 0.02965306 20%
## 3: 700 V3 0.03397185 30%
## 4: 700 V4 0.03631743 40%
## 5: 700 V5 0.03706632 50%
dt[samples == HEEP2poolOb]
## samples proportion moe prop
## 1: 2800 V1 0.01111394 10%
## 2: 2800 V2 0.01481858 20%
## 3: 2800 V3 0.01697682 30%
## 4: 2800 V4 0.01814898 40%
## 5: 2800 V5 0.01852323 50%